# -*- coding: cp936 -*-

#FileName:  PicCollector_New.py
#Author:    Pang Liang
#Time:      2012-10-3
#Descript:  Download N pictures from Google Images for each keyword in a keyword list
import os, urllib
import re
import time

class AppURLopener(urllib.FancyURLopener):
    version = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.802.30 Safari/535.1 SE 2.X MetaSr 1.0"

def c2u(x):
    """Re-encode a cp936 byte string as UTF-8.

    x is decoded as cp936 and the resulting text is encoded to UTF-8;
    the UTF-8 byte string is returned.
    """
    text = x.decode('cp936')
    return text.encode('utf-8')

def DownloadPictureFromGoogle(Tag,SampleCount,StartPageNum):
    """Download up to SampleCount images for the keyword Tag.

    Result pages are requested from google.com.hk image search one after
    another; every URL matching the gstatic thumbnail pattern is saved as
    ``<Tag>/<n>.jpg`` with n counting up from 1.

    Parameters:
        Tag           -- keyword as a cp936-encoded byte string. The raw value
                         names the output directory; the UTF-8 re-encoding
                         (via c2u) is URL-quoted into the query.
        SampleCount   -- number of images to save before stopping.
        StartPageNum  -- initial value of the ``sprg`` query parameter; after
                         each page it is replaced by the last ``data-pg``
                         number found in that page.

    Side effects:
        * creates the directory ``Tag`` if it does not exist;
        * overwrites a scratch file named ``search`` in the current directory
          with each result page;
        * installs a new AppURLopener as ``urllib._urlopener``, which affects
          later ``urllib.urlretrieve``/``urlopen`` calls in this process.

    Raises:
        IndexError -- when more images are still needed but the page has no
                      ``data-pg`` marker to continue from.
        Errors from os.mkdir / urllib / open propagate unchanged.
    """
    dirname = Tag
    Tag = c2u(Tag)
    if not os.path.isdir(dirname):
        os.mkdir(dirname)

    opener = AppURLopener()
    # addheader() expects the header name and its value as separate
    # arguments. Passing "Name:value" as one string yields a malformed
    # header line when the request is sent.
    opener.addheader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
    urllib._urlopener = opener

    # Loop invariants: compile the patterns and quote the keyword once.
    thumb_re = re.compile(r'\"(http://t.\.gstatic.com/images\?q=tbn:.+?)\"')
    page_re = re.compile(r'data-pg=([0-9]+)')
    query = urllib.quote(Tag)

    i = 0   # number of images saved so far
    c = 1   # value of the ``ijn`` query parameter, incremented per page
    while i < SampleCount:
        url = ("http://www.google.com.hk/search?q=" + query +
               "&num=10&hl=en&newwindow=1&safe=strict&gbv=2&biw=995&bih=486&site=imghp&tbm=isch&ijn=" + str(c) +
               "&ei=MQFrUMynK4qUiQfTu4GYCA&sprg=" + str(StartPageNum) +
               "&page=" + str(0) + "&start=" + str(i + 1) + "&csl=0")
        print(url)
        urllib.urlretrieve(url, 'search')
        # Close the scratch file promptly instead of leaking the handle.
        with open('search') as page_file:
            page = page_file.read()

        matches = thumb_re.findall(page)
        print(matches)
        print("matches=%d" % len(matches))
        if not matches:
            # A page without thumbnails makes no progress; stop rather than
            # keep requesting pages indefinitely.
            break

        for m in matches:
            i += 1
            filename = os.path.join(dirname, str(i) + '.jpg')
            urllib.urlretrieve(m, filename)
            print(i)
            if i >= SampleCount:
                break

        if i >= SampleCount:
            # Done: do not parse pagination, which could raise IndexError
            # even though every requested image was already saved.
            break

        EndPageNum = page_re.findall(page)
        # IndexError here means there is no marker to continue from.
        StartPageNum = int(EndPageNum[-1])
        print(StartPageNum)
        c += 1

# Driver: ask how many images to fetch per keyword, then process every
# non-empty line of list.txt as one keyword.
#
# NOTE: Python 2's input() evaluates the typed text as a Python expression.
# Read the raw text and convert it explicitly so arbitrary code cannot be
# entered at the prompt.
SampleCount = int(raw_input('The number of each sample:'))

# presumably list.txt is cp936-encoded, since each keyword is decoded as
# cp936 before being sent -- verify against the actual file
with open('list.txt') as keyword_file:
    for KeyWord in keyword_file:
        keyword = KeyWord.strip()
        if not keyword:
            # A blank line would otherwise become os.mkdir('') and fail.
            continue
        try:
            print(KeyWord)
            DownloadPictureFromGoogle(keyword, SampleCount, 0)
        except Exception as exc:
            # Best effort per keyword: report and move on to the next one.
            # Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit still terminate the run.
            print('Error!')
            print(KeyWord)
            print(repr(exc))
